1 Introduction


Objectives: The goal of this kernel is to analyze the effects and flavors of three different types of cannabis.

The EDA uses datatable, skim, and plotly; xgboost with histograms is used as the model for this analysis.

If you have any questions, please leave a comment, and if you like the kernel, please give it an upvote~ Thanks!


2 Basic Set up



2.1 Load Packages


library(tidyverse)
library(skimr)
library(highcharter)
# library(qdap)
library(tm)
library(plotly)
library(viridis)
library(wordcloud)
library(plotrix)
library(DescTools)
library(DescTools)
library(DT)

2.2 Load Dataset


# Load the cannabis strain data; every later section works from `weed`.
weed <- read_csv("input/cannabis.csv")

3 Glimpses



3.1 datatable


# Browse the raw data as an interactive table with per-column filters on top,
# 15 rows per page.
datatable(
  weed,
  filter = 'top',
  options = list(pageLength = 15, autoWidth = TRUE)
)

3.2 skim


# Per-column summary statistics from skim(), rendered as plain-text tables
# by kable().
weed %>% skim() %>% kable()
## Skim summary statistics  
##  n obs: 2351    
##  n variables: 6    
## 
## Variable type: character
## 
## variable      missing   complete   n      min   max    empty   n_unique 
## ------------  --------  ---------  -----  ----  -----  ------  ---------
## Description   33        2318       2351   4     1120   0       2312     
## Effects       0         2351       2351   4     46     0       1655     
## Flavor        46        2305       2351   3     30     0       1293     
## Strain        0         2351       2351   2     30     0       2350     
## Type          0         2351       2351   6     6      0       3        
## 
## Variable type: numeric
## 
## variable   missing   complete   n      mean   sd     p0   p25   p50   p75   p100   hist     
## ---------  --------  ---------  -----  -----  -----  ---  ----  ----  ----  -----  ---------
## Rating     0         2351       2351   4.31   0.84   0    4.2   4.4   4.7   5      <U+2581><U+2581><U+2581><U+2581><U+2581><U+2581><U+2585><U+2587>

3.3 glimpse


# Compact overview: column names, types and the first few values of each.
glimpse(weed)
## Observations: 2,351
## Variables: 6
## $ Strain      <chr> "100-Og", "98-White-Widow", "1024", "13-Dawgs", "2...
## $ Type        <chr> "hybrid", "hybrid", "sativa", "hybrid", "hybrid", ...
## $ Rating      <dbl> 4.0, 4.7, 4.4, 4.2, 4.6, 0.0, 4.4, 4.2, 4.6, 4.4, ...
## $ Effects     <chr> "Creative,Energetic,Tingly,Euphoric,Relaxed", "Rel...
## $ Flavor      <chr> "Earthy,Sweet,Citrus", "Flowery,Violet,Diesel", "S...
## $ Description <chr> "$100 OG is a 50/50 hybrid strain that packs a str...

4 TreeMaps



4.1 TreeMap by Type


# Treemap of the number of strains per Type; tile size and colour both
# encode the count. Piped straight into hchart(), so no temporary is left
# behind.
weed %>%
  count(Type) %>%
  hchart(type = 'treemap', hcaes(x = 'Type', value = 'n', color = 'n'))

4.2 TreeMap by Effects


# Split the comma-separated Effects string so that every effect of a strain
# gets its own row. `weed_effects` is reused by later sections.
weed_effects <- unnest(
  mutate(weed, Effects = str_split(Effects, ',')),
  Effects
)

# Treemap of how often each effect occurs across all strains.
hchart(
  count(weed_effects, Effects),
  type = 'treemap',
  hcaes(x = 'Effects', value = 'n', color = 'n')
)

4.3 TreeMap by Flavors


# One row per (strain, flavor) pair. Rows whose Flavor is the literal 'none'
# are dropped; filter() also discards rows where Flavor is NA, because the
# comparison evaluates to NA for them. `weed_flavor` is reused later.
weed_flavor <- filter(weed, Flavor != 'none') %>%
  mutate(Flavor = str_split(Flavor, pattern = ',')) %>%
  unnest(Flavor)

# Treemap of how often each flavor occurs across the remaining strains.
hchart(
  count(weed_flavor, Flavor),
  type = 'treemap',
  hcaes(x = 'Flavor', value = 'n', color = 'n')
)

5 Drill Down Graph for 3 Types by Effects and by Flavors


# Top level of the drill-down: one bar per cannabis type. `drilldown` holds
# the id of the series that opens when that bar is clicked.
df1 <- weed %>% 
  group_by(name = Type, drilldown = Type) %>% 
  summarise(y = n()) %>% 
  arrange(desc(y))

# Second level: one series per type (id = Type). do() stores, per type, a
# list of points in `data`, one per effect, with `y` = number of rows for
# that effect within the type, largest first.
# NOTE(review): each point also carries a `drilldown` id ("type: effect"),
# but no series with such an id is passed to hc_drilldown() below.
df2 <- weed_effects %>% 
  group_by(Type, Effects) %>% 
  mutate(y = n(), colorByPoint =  1) %>% 
  arrange(desc(y)) %>%
  group_by(name = Type, id = Type, colorByPoint) %>% 
  do(data = list_parse(
    mutate(.,name = Effects, drilldown = tolower(paste(Type, Effects,sep=": "))) %>% 
      group_by(name, drilldown) %>% 
      summarise(y=n()) %>% 
      select(name, y, drilldown) %>%
      arrange(desc(y)))) 

# The outer parentheses print the chart right here in addition to storing it
# in `a`, which the grid further down uses.
(a <- highchart() %>% 
  hc_chart(type = 'bar') %>% 
  hc_xAxis(type = "category") %>% 
  hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>% 
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series =list_parse(df2)
  ) %>%
  hc_legend(enabled = FALSE) %>% 
  hc_title(text = "Type of Cannabis vs Effects") %>% 
  hc_add_theme(hc_theme_darkunica()))
rm(df1, df2)




# Top level of the drill-down: one bar per cannabis type. `drilldown` holds
# the id of the series that opens when that bar is clicked.
df1 <- weed %>% 
  group_by(name = Type, drilldown = Type) %>% 
  summarise(y = n()) %>% 
  arrange(desc(y))

# Second level: one series per type (id = Type). do() stores, per type, a
# list of points in `data`, one per flavor, with `y` = number of rows for
# that flavor within the type, largest first.
# NOTE(review): each point also carries a `drilldown` id ("type: flavor"),
# but no series with such an id is passed to hc_drilldown() below.
df2 <- weed_flavor %>% 
  group_by(Type, Flavor) %>% 
  mutate(y = n(), colorByPoint = 1) %>% 
  arrange(desc(y)) %>%
  group_by(name = Type, id = Type, colorByPoint) %>% 
  do(data = list_parse(
    mutate(.,name = Flavor, drilldown = tolower(paste(Type, Flavor,sep=": "))) %>% 
      group_by(name, drilldown) %>% 
      summarise(y=n()) %>% 
      select(name, y, drilldown) %>%
      arrange(desc(y)))) 

# Stored in `b` without printing; the grid further down displays it.
b <- highchart() %>% 
  hc_chart(type = 'bar') %>% 
  hc_xAxis(type = "category") %>% 
  hc_add_series(name = 'number of cannabis', data = df1, colorByPoint = 1) %>% 
  hc_drilldown(
    allowPointDrilldown = TRUE,
    series =list_parse(df2)
  ) %>%
  hc_legend(enabled = FALSE) %>% 
  hc_title(text = "Type of Cannabis vs Flavor") %>% 
  hc_add_theme(hc_theme_darkunica())
rm(df1, df2)



# Lay the two drill-down charts out in one grid (rows 400px high), then
# discard the chart objects.
hw_grid(list(a, b), rowheight = 400)
rm(a, b)

6 NLP Setup



6.1 clean corpus


# Normalise a tm corpus for term counting.
#
# Steps, in order: strip punctuation, collapse whitespace, lower-case, remove
# stop words, remove numbers.
#
# Args:
#   corpus: a tm corpus.
#
# Returns:
#   The transformed corpus.
cleanCorpus <- function(corpus){

  # Punctuation is stripped before stop words are removed, so contractions
  # show up without their apostrophe ("that's" -> "thats"); list those
  # spellings explicitly on top of the standard English stop words.
  contractions <- c("thats","weve","hes","theres","ive","im",
                    "will","can","cant","dont","youve","us",
                    "youre","youll","theyre","whats","didnt")
  drop_words <- c(stopwords("en"), contractions)

  corpus %>%
    tm_map(removePunctuation) %>%
    tm_map(stripWhitespace) %>%
    tm_map(content_transformer(tolower)) %>%
    tm_map(removeWords, drop_words) %>%
    tm_map(removeNumbers)
}

6.2 frequent terms


# Count how often each term occurs in a character vector of documents.
#
# Args:
#   text: character vector, one element per document.
#
# Returns:
#   A data frame with columns `word` and `freq`, sorted by decreasing
#   frequency. Terms absent from more than 99.9% of the documents are
#   dropped first.
frequentTerms <- function(text){

  term_matrix <- text %>%
    VectorSource() %>%
    Corpus() %>%
    cleanCorpus() %>%
    TermDocumentMatrix() %>%
    removeSparseTerms(0.999) %>%
    as.matrix()

  # Row sums give the total count of each term over all documents.
  totals <- sort(rowSums(term_matrix), decreasing = TRUE)
  data.frame(word = names(totals), freq = totals)
}

6.3 clean by each type


# Build a term-by-Type count matrix from the strain descriptions.
#
# The descriptions of all strains of one Type are pasted into a single
# document per Type; the documents are cleaned with cleanCorpus() and turned
# into a term-document matrix with one column per Type.
#
# Args:
#   dataset: data frame with a `Type` and a `Description` column.
#   n_types: how many of the most frequent types to keep. Defaults to 3,
#            the number that used to be hard-coded.
#
# Returns:
#   The first rows (head()) of the matrix. The full matrix is additionally
#   written to `all_clean` in the global environment, because the word-cloud,
#   pyramid-plot and 3D sections further down read it from there.
clean_top_char <- function(dataset, n_types = 3){

  # Rank the types once (most frequent first) rather than on every loop
  # iteration, and take at most `n_types` of them so that a data set with
  # fewer types does not produce an NA column.
  type_counts <- dataset %>%
    filter(!is.na(Type)) %>%
    count(Type) %>%
    arrange(desc(n)) %>%
    head(n_types)
  namelist <- as.character(type_counts$Type)

  if (length(namelist) == 0) {
    stop("`dataset` contains no non-missing Type values", call. = FALSE)
  }

  # One document per type. Missing descriptions are skipped; paste() would
  # otherwise coerce them to the string "NA".
  all_dialogue <- vapply(namelist, function(type_name){
    keep <- !is.na(dataset$Type) &
      dataset$Type == type_name &
      !is.na(dataset$Description)
    paste(dataset$Description[keep], collapse = " ")
  }, character(1), USE.NAMES = FALSE)

  all_clean <- all_dialogue %>%
    VectorSource() %>%
    Corpus() %>%
    cleanCorpus() %>%
    TermDocumentMatrix() %>%
    as.matrix()

  colnames(all_clean) <- namelist

  # Deliberate global side effect: later sections use `all_clean` directly.
  assign("all_clean", all_clean, envir = .GlobalEnv)
  head(all_clean)
}

# Build the per-type term matrix (also stored globally as `all_clean`) and
# show its first rows.
clean_top_char(weed)
##              Docs
## Terms         hybrid indica sativa
##   abandon          1      0      0
##   abate            8      5      5
##   abates           1      1      0
##   abating          3      1      0
##   abbreviated      1      0      0
##   abduct           1      0      0

7 Top 30 Words in Description


# Bar chart of the 30 most frequent description terms, bars ordered by
# decreasing frequency and coloured per word.
frequentTerms(weed$Description) %>%
  head(30) %>%
  mutate(word = factor(word)) %>%
  plot_ly(
    x = ~reorder(word, -freq),
    y = ~freq,
    colors = viridis(10)
  ) %>%
  add_bars(color = ~word) %>%
  layout(
    title = "Top 30 Words",
    yaxis = list(title = " "),
    xaxis = list(title = ""),
    margin = list(l = 100)
  )

8 WordCloud



8.1 Commonality Cloud


# Word cloud of the terms shared by the sativa and indica descriptions.
commonality.cloud(
  all_clean[, c("sativa", "indica")],
  colors = "steelblue1",
  at.least = 2,
  max.words = 100
)


8.2 Comparison Cloud


# Word cloud contrasting the sativa and indica descriptions, one colour per
# type.
comparison.cloud(
  all_clean[, c("sativa", "indica")],
  colors = c("#F8766D", "#00BFC4"),
  max.words = 50
)


9 Pyramid Plot


# Terms that occur in both the sativa and the indica documents, ordered by
# how far apart their two counts are (largest gap first).
common_words <- as.data.frame(all_clean) %>%
  rownames_to_column() %>%
  filter(sativa > 0 & indica > 0) %>%
  mutate(difference = abs(sativa - indica)) %>%
  arrange(desc(difference))

# The 25 terms with the largest gap go into the plot.
common_words_25 <- head(common_words, 25)

# Back-to-back bars: sativa counts on the left, indica counts on the right,
# the terms themselves in the middle.
pyramid.plot(
  common_words_25$sativa,
  common_words_25$indica,
  labels = common_words_25$rowname,
  gap = 200,
  top.labels = c("sativa", "Words", "indica"),
  main = "Words in Common",
  laxlab = NULL,
  raxlab = NULL,
  unit = NULL
)

## [1] 5.1 4.1 4.1 2.1
rm(common_words, common_words_25)

10 3D Plotly


# Distinct effect names, lower-cased so they can be matched against the
# terms of `all_clean` (cleanCorpus() lower-cases the text).
effects <- weed_effects$Effects %>% unique() %>% tolower()
rm(weed_effects)


# Keep only the rows of the term matrix whose term is an effect name; the
# hybrid/sativa/indica columns then hold how often each effect word appears
# in the descriptions of that type.
effectByType <- all_clean %>%
  as.data.frame() %>% 
  rownames_to_column('word') %>% 
  filter(word %in% effects) %>% 
  mutate(word=factor(word))
  

# 3D scatter: one marker per effect word, positioned by its count in the
# hybrid (x), sativa (y) and indica (z) descriptions.
effectByType %>% 
  plot_ly(x=~hybrid,y=~sativa,z= ~indica, color=~word, hoverinfo = 'text', colors = viridis(15),
          text = ~paste('Effects:', word,
                        '<br>hybrid:', hybrid,
                        '<br>sativa:', sativa,
                        '<br>indica:', indica)) %>% 
  add_markers(opacity = 0.8) %>%
  layout(title = "Effects by Different Cannabis",
         annotations=list(yref='paper',xref="paper",y=1.05,x=1.1, text="Effects",showarrow=FALSE),
         scene = list(xaxis = list(title = 'hybrid'),
                      yaxis = list(title = 'sativa'),
                      zaxis = list(title = 'indica')))

11 Conclusion


Hope you enjoyed the kernel and don’t forget to upvote~ Thanks a lot!